{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "import json\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hansard = glob('/home/husein/ssd3/hansard/hansard.jsonl*.splitted.requested')\n",
    "len(hansard)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "malaysia_pdf = glob('/home/husein/ssd3/malaysia-pdf/combine.jsonl*.splitted.requested')\n",
    "len(malaysia_pdf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "22"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "malay_news = glob('/home/husein/google-translate-malay-news/*.requested')\n",
    "len(malay_news)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('/home/husein/ssd3/gpt4all/translated-gpt4all-code.json') as fopen:\n",
    "    gpt4all = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('train.json', 'w') as fopen_jsonl:\n",
    "    with open('/home/husein/ssd3/codecontext/code-instructions.translated.jsonl') as fopen:\n",
    "        for l in fopen:\n",
    "            data = json.loads(l)\n",
    "            ms = data['src']['instruction']\n",
    "            en = data['r']['result']\n",
    "\n",
    "            ms_tokenized = ms.split()\n",
    "            en_tokenized = en.split()\n",
    "\n",
    "            if len(en_tokenized) and len(en_tokenized) < 1800 and len(ms_tokenized) and len(ms_tokenized) < 1800:\n",
    "                d = {\"translation\": {\"src\": en, \"tgt\": ms, 'prefix': 'terjemah ke Melayu: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "                d = {\"translation\": {\"src\": ms, \"tgt\": en, 'prefix': 'terjemah ke Inggeris: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "\n",
    "                fopen_jsonl.flush()\n",
    "            \n",
    "            en = data['src']['output']\n",
    "            d = {\"translation\": {\"src\": en, \"tgt\": en, 'prefix': 'terjemah ke Inggeris: '}}\n",
    "            fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "            \n",
    "            fopen_jsonl.flush()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"translation\": {\"src\": \"Cipta satu fungsi untuk mengira hasil tambah jujukan integer.\", \"tgt\": \"Create a function to calculate the sum of a sequence of integers.\", \"prefix\": \"terjemah ke Melayu: \"}}\r\n",
      "{\"translation\": {\"src\": \"Create a function to calculate the sum of a sequence of integers.\", \"tgt\": \"Cipta satu fungsi untuk mengira hasil tambah jujukan integer.\", \"prefix\": \"terjemah ke Inggeris: \"}}\r\n",
      "{\"translation\": {\"src\": \"# Python code\\ndef sum_sequence(sequence):\\n  sum = 0\\n  for num in sequence:\\n    sum += num\\n  return sum\", \"tgt\": \"# Python code\\ndef sum_sequence(sequence):\\n  sum = 0\\n  for num in sequence:\\n    sum += num\\n  return sum\", \"prefix\": \"terjemah ke Inggeris: \"}}\r\n"
     ]
    }
   ],
   "source": [
    "!head -n 3 train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('train.json', 'a') as fopen_jsonl:\n",
    "    with open('/home/husein/ssd3/NSText2SQL/combine.jsonl') as fopen:\n",
    "        for l in fopen:\n",
    "            data = json.loads(l)\n",
    "            if '--' in data['r']['result']:\n",
    "                ms = data['r']['result'].strip()\n",
    "                en = data['src']['question'].strip()\n",
    "                \n",
    "                d = {\"translation\": {\"src\": en, \"tgt\": ms, 'prefix': 'terjemah ke Melayu: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "                d = {\"translation\": {\"src\": ms, \"tgt\": en, 'prefix': 'terjemah ke Inggeris: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "                \n",
    "                fopen_jsonl.flush()\n",
    "            \n",
    "            \n",
    "            en = data['src']['table']\n",
    "            d = {\"translation\": {\"src\": en, \"tgt\": en, 'prefix': 'terjemah ke Inggeris: '}}\n",
    "            fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "            \n",
    "            fopen_jsonl.flush()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"translation\": {\"src\": \"Using valid SQLite, answer the following questions for the tables provided above.\\n\\n-- provide the number of patients whose days of hospital stay is greater than 29 and drug name is miconazole powder 2%?\", \"tgt\": \"Menggunakan SQLite yang sah, jawab soalan berikut untuk jadual yang disediakan di atas.\\n\\n-- berikan bilangan pesakit yang hari tinggal di hospital lebih daripada 29 dan nama ubat ialah serbuk miconazole 2%?\", \"prefix\": \"terjemah ke Melayu: \"}}\r\n",
      "{\"translation\": {\"src\": \"Menggunakan SQLite yang sah, jawab soalan berikut untuk jadual yang disediakan di atas.\\n\\n-- berikan bilangan pesakit yang hari tinggal di hospital lebih daripada 29 dan nama ubat ialah serbuk miconazole 2%?\", \"tgt\": \"Using valid SQLite, answer the following questions for the tables provided above.\\n\\n-- provide the number of patients whose days of hospital stay is greater than 29 and drug name is miconazole powder 2%?\", \"prefix\": \"terjemah ke Inggeris: \"}}\r\n",
      "{\"translation\": {\"src\": \"CREATE TABLE demographic (\\n    subject_id text,\\n    hadm_id text,\\n    name text,\\n    marital_status text,\\n    age text,\\n    dob text,\\n    gender text,\\n    language text,\\n    religion text,\\n    admission_type text,\\n    days_stay text,\\n    insurance text,\\n    ethnicity text,\\n    expire_flag text,\\n    admission_location text,\\n    discharge_location text,\\n    diagnosis text,\\n    dod text,\\n    dob_year text,\\n    dod_year text,\\n    admittime text,\\n    dischtime text,\\n    admityear text\\n)\\n\\nCREATE TABLE procedures (\\n    subject_id text,\\n    hadm_id text,\\n    icd9_code text,\\n    short_title text,\\n    long_title text\\n)\\n\\nCREATE TABLE prescriptions (\\n    subject_id text,\\n    hadm_id text,\\n    icustay_id text,\\n    drug_type text,\\n    drug text,\\n    formulary_drug_cd text,\\n    route text,\\n    drug_dose text\\n)\\n\\nCREATE TABLE diagnoses (\\n    subject_id text,\\n    hadm_id text,\\n    icd9_code text,\\n    short_title text,\\n    long_title text\\n)\\n\\nCREATE TABLE lab (\\n    subject_id text,\\n    hadm_id text,\\n    itemid text,\\n    charttime text,\\n    flag text,\\n    value_unit text,\\n    label text,\\n    fluid text\\n)\\n\\n\\n\", \"tgt\": \"CREATE TABLE demographic (\\n    subject_id text,\\n    hadm_id text,\\n    name text,\\n    marital_status text,\\n    age text,\\n    dob text,\\n    gender text,\\n    language text,\\n    religion text,\\n    admission_type text,\\n    days_stay text,\\n    insurance text,\\n    ethnicity text,\\n    expire_flag text,\\n    admission_location text,\\n    discharge_location text,\\n    diagnosis text,\\n    dod text,\\n    dob_year text,\\n    dod_year text,\\n    admittime text,\\n    dischtime text,\\n    admityear text\\n)\\n\\nCREATE TABLE procedures (\\n    subject_id text,\\n    hadm_id text,\\n    icd9_code text,\\n    short_title text,\\n    long_title text\\n)\\n\\nCREATE TABLE prescriptions (\\n    subject_id text,\\n    hadm_id text,\\n    icustay_id text,\\n    drug_type text,\\n    drug text,\\n    formulary_drug_cd text,\\n    route text,\\n    drug_dose text\\n)\\n\\nCREATE TABLE diagnoses (\\n    subject_id text,\\n    hadm_id text,\\n    icd9_code text,\\n    short_title text,\\n    long_title text\\n)\\n\\nCREATE TABLE lab (\\n    subject_id text,\\n    hadm_id text,\\n    itemid text,\\n    charttime text,\\n    flag text,\\n    value_unit text,\\n    label text,\\n    fluid text\\n)\\n\\n\\n\", \"prefix\": \"terjemah ke Inggeris: \"}}\r\n"
     ]
    }
   ],
   "source": [
    "!tail -n 3 train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "rejected = {\n",
    "    'dependensinya',\n",
    "    'mengkonfiguraskannya',\n",
    "    'bisa',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "with open('train.json', 'a') as fopen_jsonl:\n",
    "    with open('/home/husein/ssd3/codecontext/combine.translated.jsonl') as fopen:\n",
    "        for l in fopen:\n",
    "            data = json.loads(l)\n",
    "            en = data['instruction'] or ''\n",
    "            ms = data['instruction_ms'] or ''\n",
    "            overlap = set(ms.lower().split()) & rejected\n",
    "            \n",
    "            if len(en) and len(ms) and not len(overlap):\n",
    "                d = {\"translation\": {\"src\": en, \"tgt\": ms, 'prefix': 'terjemah ke Melayu: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "                d = {\"translation\": {\"src\": ms, \"tgt\": en, 'prefix': 'terjemah ke Inggeris: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "                \n",
    "                fopen_jsonl.flush()\n",
    "            \n",
    "            en = data['output'] or ''\n",
    "            ms = data['output_ms'] or ''\n",
    "            overlap = set(ms.lower().split()) & rejected\n",
    "                \n",
    "            if len(en) and len(ms) and not len(overlap):\n",
    "                d = {\"translation\": {\"src\": en, \"tgt\": ms, 'prefix': 'terjemah ke Melayu: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "                d = {\"translation\": {\"src\": ms, \"tgt\": en, 'prefix': 'terjemah ke Inggeris: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "                \n",
    "                fopen_jsonl.flush()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"translation\": {\"src\": \"Buatlah sebuah struktur bernama \\\"CartItem\\\" dengan properti untuk nama item, harga, dan kuantiti.\\nInput: Tiada.\", \"tgt\": \"Create a struct named \\\"CartItem\\\" with properties for item name, price, and quantity.\\nInput: None.\", \"prefix\": \"terjemah ke Inggeris: \"}}\r\n",
      "{\"translation\": {\"src\": \"Solution: To create the \\\"CartItem\\\" struct with properties for item name, price, and quantity, I will write the following code:\\n\\n```\\nstruct CartItem {\\n    let name: String\\n    let price: Double\\n    var quantity: Int\\n}\\n```\", \"tgt\": \"Untuk mencipta struktur \\\"CartItem\\\" dengan properti untuk nama item, harga, dan kuantiti, saya akan menulis kod berikut:\\n\\n```\\nstruct CartItem {\\n    let name: String\\n    let price: Double\\n    var quantity: Int\\n}\\n```\", \"prefix\": \"terjemah ke Melayu: \"}}\r\n",
      "{\"translation\": {\"src\": \"Untuk mencipta struktur \\\"CartItem\\\" dengan properti untuk nama item, harga, dan kuantiti, saya akan menulis kod berikut:\\n\\n```\\nstruct CartItem {\\n    let name: String\\n    let price: Double\\n    var quantity: Int\\n}\\n```\", \"tgt\": \"Solution: To create the \\\"CartItem\\\" struct with properties for item name, price, and quantity, I will write the following code:\\n\\n```\\nstruct CartItem {\\n    let name: String\\n    let price: Double\\n    var quantity: Int\\n}\\n```\", \"prefix\": \"terjemah ke Inggeris: \"}}\r\n"
     ]
    }
   ],
   "source": [
    "!tail -n 3 train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "mss = [hansard, malaysia_pdf, malay_news]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "27971it [00:02, 11428.78it/s]\n",
      "28150it [00:02, 10816.26it/s]\n",
      "28083it [00:02, 10101.77it/s]\n",
      "25223it [00:02, 10029.90it/s]\n",
      "28314it [00:02, 10586.61it/s]\n",
      "26720it [00:01, 25859.86it/s]\n",
      "87295it [00:01, 49416.97it/s]\n",
      "83589it [00:01, 51051.00it/s]\n",
      "17297it [00:00, 28762.78it/s]\n",
      "59356it [00:01, 43897.52it/s]\n",
      "87609it [00:00, 99846.28it/s] \n",
      "100000it [00:01, 99625.00it/s]\n",
      "100000it [00:01, 96666.94it/s]\n",
      "100000it [00:01, 95764.30it/s]\n",
      "99897it [00:00, 113799.93it/s]\n",
      "99999it [00:01, 99437.39it/s] \n",
      "99992it [00:01, 99910.49it/s] \n",
      "99994it [00:00, 106553.47it/s]\n",
      "99997it [00:01, 93846.31it/s] \n",
      "99998it [00:00, 111930.91it/s]\n",
      "100000it [00:01, 90188.20it/s]\n",
      "99989it [00:01, 91259.37it/s] \n",
      "100000it [00:01, 97826.22it/s]\n",
      "99994it [00:00, 100970.34it/s]\n",
      "100000it [00:01, 96476.09it/s]\n",
      "100000it [00:01, 97835.87it/s]\n",
      "100000it [00:00, 106273.62it/s]\n",
      "100000it [00:01, 98723.21it/s]\n",
      "100000it [00:00, 100089.18it/s]\n",
      "100000it [00:01, 98504.27it/s]\n",
      "100000it [00:00, 102575.80it/s]\n",
      "99995it [00:01, 98316.98it/s] \n"
     ]
    }
   ],
   "source": [
    "with open('train.json', 'a') as fopen_jsonl:\n",
    "    for ms in mss:\n",
    "        for f in ms:\n",
    "            with open(f) as fopen:\n",
    "                for l in tqdm(fopen):\n",
    "                    data = json.loads(l)\n",
    "                    ms = data['src']\n",
    "                    en = data['r']['result']\n",
    "\n",
    "                    ms_tokenized = ms.split()\n",
    "                    en_tokenized = en.split()\n",
    "\n",
    "                    if len(en_tokenized) and len(en_tokenized) < 1800 and len(ms_tokenized) and len(ms_tokenized) < 1800:\n",
    "                        d = {\"translation\": {\"src\": en, \"tgt\": ms, 'prefix': 'terjemah ke Melayu: '}}\n",
    "                        fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "                        d = {\"translation\": {\"src\": ms, \"tgt\": en, 'prefix': 'terjemah ke Inggeris: '}}\n",
    "                        fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "\n",
    "                        fopen_jsonl.flush()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('train.json', 'a') as fopen_jsonl:\n",
    "    for l in gpt4all:\n",
    "        try:\n",
    "            en = l['prompt'][0]\n",
    "            ms = l['prompt'][1]\n",
    "            overlap = set(ms.lower().split()) & rejected\n",
    "\n",
    "            if len(en) and len(ms) and not len(overlap):\n",
    "                d = {\"translation\": {\"src\": en, \"tgt\": ms, 'prefix': 'terjemah ke Melayu: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "\n",
    "                d = {\"translation\": {\"src\": ms, \"tgt\": en, 'prefix': 'terjemah ke Inggeris: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "\n",
    "            en = l['response'][0]\n",
    "            ms = l['response'][1]\n",
    "            overlap = set(ms.lower().split()) & rejected\n",
    "\n",
    "            if len(en) and len(ms) and not len(overlap):\n",
    "                d = {\"translation\": {\"src\": en, \"tgt\": ms, 'prefix': 'terjemah ke Melayu: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "\n",
    "                d = {\"translation\": {\"src\": ms, \"tgt\": en, 'prefix': 'terjemah ke Inggeris: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "        except Exception as e:\n",
    "            print(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "!shuf train.json > shuffled-train.json\n",
    "!head -n 1000 shuffled-train.json > test.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"translation\": {\"src\": \"\\\"Saya berada di rumah dan menunggu sehingga boleh kembali bersama pasukan untuk berusaha untuk pasukan ini\\\".\", \"tgt\": \"\\\"I'm at home and waiting until I can get back with the team to work for this team\\\".\", \"prefix\": \"terjemah ke Inggeris: \"}}\r\n",
      "{\"translation\": {\"src\": \"Pengasas Bersamanya, Shazwan Abdul Razak, berkata syarikat yang memulakan operasi e-skuter di Iskandar Puteri, Johor pada Jun tahun ini sudah memiliki 320 unit mobiliti berkenaan.\", \"tgt\": \"Its Co-Founder, Shazwan Abdul Razak, said the company that started e-scooter operations in Iskandar Puteri, Johor in June this year already has 320 mobility units.\", \"prefix\": \"terjemah ke Inggeris: \"}}\r\n",
      "{\"translation\": {\"src\": \"Tulis program CSS untuk reka bentuk responsif pada halaman web.\", \"tgt\": \"Write a CSS program for responsive design on a webpage.\", \"prefix\": \"terjemah ke Melayu: \"}}\r\n",
      "{\"translation\": {\"src\": \"Even though I put my foot on top of his leg, I can still feel the pressure exerted.\", \"tgt\": \"Walaupun saya meletakkan kaki saya di atas kakinya, saya masih boleh merasa tekanan yang diberikan.\", \"prefix\": \"terjemah ke Melayu: \"}}\r\n",
      "{\"translation\": {\"src\": \"GUA MUSANG - Lebih 30 wakil badan bukan kerajaan (NGO) mengadakan solat sunat hajat memohon doa supaya dipermudahkan urusan dan tugas bantuan kepada penduduk Orang Asli suku kaum Batek di Kampung Kuala Koh, di sini, hari ini.\", \"tgt\": \"GUA MUSANG - More than 30 representatives of non-governmental organizations (NGOs) held prayers asking for prayers to facilitate the affairs and tasks of helping the Orang Asli residents of the Batek tribe in Kampung Kuala Koh, here, today.\", \"prefix\": \"terjemah ke Inggeris: \"}}\r\n",
      "{\"translation\": {\"src\": \"\\\"The International Olympic Committee is responsible for decisions regarding athletes participating in the London Olympics,\\\" the IWF said.\", \"tgt\": \"\\\"Jawatankuasa Olimpik Antarabangsa bertanggungjawab terhadap keputusan berhubung atlet yang menyertai Olimpik London,\\\" kata IWF.\", \"prefix\": \"terjemah ke Melayu: \"}}\r\n",
      "{\"translation\": {\"src\": \"\\\"He did not give any letter to the local authority (PBT) or inform me that he was ill,\\\" he said.\", \"tgt\": \"\\\"Beliau tidak memberi sebarang surat kepada pihak berkuasa tempatan (PBT) atau memaklumkan kepada saya yang dia sakit,\\\" katanya.\", \"prefix\": \"terjemah ke Melayu: \"}}\r\n",
      "{\"translation\": {\"src\": \"CREATE TABLE table_name_1 (\\n    constructor VARCHAR,\\n    fastest_lap VARCHAR\\n)\\n\\n\\n\", \"tgt\": \"CREATE TABLE table_name_1 (\\n    constructor VARCHAR,\\n    fastest_lap VARCHAR\\n)\\n\\n\\n\", \"prefix\": \"terjemah ke Inggeris: \"}}\r\n",
      "{\"translation\": {\"src\": \"Online previously reported, a viral video of a man admitting to having a same-sex relationship with an individual who looks like a minister.\", \"tgt\": \"Online terdahulu melaporkan, sebuah video tular seorang lelaki mengaku mengadakan hubungan sejenis dengan individu mirip seorang menteri.\", \"prefix\": \"terjemah ke Melayu: \"}}\r\n",
      "{\"translation\": {\"src\": \"Isu pencerobohan bot-bot kenka di kawasan perairan berkenaan berlaku sejak 2007 dan masih berterusan hingga kini kerana tiada tindakan proaktif dan berkesan oleh agensi penguatkuasaan, menyebabkan pengusaha bot mengambil kesempatan mencerobohi kawasan tangkapan ikan yang mereka tidak dibenarkan beroperasi, katanya.\", \"tgt\": \"The issue of encroachment by kenka boats in the waters in question has been happening since 2007 and is still continuing until now because there is no proactive and effective action by enforcement agencies, causing boat operators to take the opportunity to encroach on fishing areas where they are not allowed to operate, he said.\", \"prefix\": \"terjemah ke Inggeris: \"}}\r\n"
     ]
    }
   ],
   "source": [
    "!head -n 10 shuffled-train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"translation\": {\"src\": \"\\\"You can be in your office or in your respective area and still be able to follow the Dewan Rakyat meeting because it is broadcast live,\\\" he said.\", \"tgt\": \"\\\"Anda boleh berada di pejabat atau di kawasan masing-masing dan masih boleh mengikuti persidangan Dewan Rakyat kerana ia disiarkan secara langsung,\\\" katanya.\", \"prefix\": \"terjemah ke Melayu: \"}}\r\n",
      "{\"translation\": {\"src\": \"7. The government is confident that the Vision of Shared Prosperity will succeed in shaping Malaysia as a united, prosperous and dignified country and subsequently emerge as the axis of the Asian economy.\", \"tgt\": \"7. Kerajaan yakin Wawasan Kemakmuran Bersama akan berjaya membentuk Malaysia sebagai sebuah negara yang bersatu, makmur dan bermaruah seterusnya muncul sebagai paksi ekonomi Asia.\", \"prefix\": \"terjemah ke Melayu: \"}}\r\n",
      "{\"translation\": {\"src\": \"I'm not the type of person to run away from issues.\", \"tgt\": \"Saya bukan jenis orang yang lari daripada isu.\", \"prefix\": \"terjemah ke Melayu: \"}}\r\n",
      "{\"translation\": {\"src\": \"\\\"Apabila datang sahaja ambulans Malaysia, petugas Arab Saudi sangat puji dan mereka melahirkan rasa kepercayaan terhadap Malaysia.\", \"tgt\": \"\\\"When the Malaysian ambulance came, the Saudi Arabian staff were very complimentary and expressed their trust in Malaysia.\", \"prefix\": \"terjemah ke Inggeris: \"}}\r\n",
      "{\"translation\": {\"src\": \"Using valid SQLite, answer the following questions for the tables provided above.\\n\\n-- Please use a bar chart to show the average amount of payment by each payment method code, rank x-axis from low to high order.\", \"tgt\": \"Menggunakan SQLite yang sah, jawab soalan berikut untuk jadual yang disediakan di atas.\\n\\n-- Sila gunakan carta bar untuk menunjukkan purata amaun pembayaran mengikut setiap kod kaedah pembayaran, kedudukan paksi x daripada pesanan rendah ke tinggi.\", \"prefix\": \"terjemah ke Melayu: \"}}\r\n",
      "{\"translation\": {\"src\": \"Mana yg sempat Paknil ScreenCap.\", \"tgt\": \"Who has time to Paknil ScreenCap.\", \"prefix\": \"terjemah ke Inggeris: \"}}\r\n",
      "{\"translation\": {\"src\": \"Daphne Tan, Bernice Lim, New Hui Fen dan Cherie Tan dari Singapura di tempat kedua dengan 2,443 jatuhan pin manakala Putty Armein, Shinta Yunita, Sharon Limansantoso dan Tannya Roumpiper dari Indonesia di tempat ketiga dengan mengutip 2,346 pin.\", \"tgt\": \"Daphne Tan, Bernice Lim, New Hui Fen and Cherie Tan from Singapore are in second place with 2,443 pin drops while Putty Armein, Shinta Yunita, Sharon Limansantoso and Tannya Roumpiper from Indonesia are in third place with 2,346 pins.\", \"prefix\": \"terjemah ke Inggeris: \"}}\r\n",
      "{\"translation\": {\"src\": \"Gamuda Bhd founder Koon Yew Yin was present at the Jelapang Police Station in Perak today regarding his statement against the Malaysian Armed Forces (ATM) which was considered by some to be insulting.\", \"tgt\": \"Pengasas Gamuda Bhd Koon Yew Yin hadir di Balai Polis Jelapang di Perak hari ini berhubung kenyataannya terhadap Angkatan Tentera Malaysia (ATM) yang dianggap sesetengah pihak sebagai menghina.\", \"prefix\": \"terjemah ke Melayu: \"}}\r\n",
      "{\"translation\": {\"src\": \"(a) Komitmen Kementerian memanjangkan bantuan pengudaraan, penapis HEPA dan saringan COVID-19 kepada pusat penjagaan warga emas, taska dan pusat penjagaan kanak-kanak; dan.\", \"tgt\": \"(a) The Ministry's commitment to extend ventilation assistance, HEPA filters and COVID-19 screening to care centers for the elderly, nurseries and child care centers; and.\", \"prefix\": \"terjemah ke Inggeris: \"}}\r\n",
      "{\"translation\": {\"src\": \"- Cerapan Lokasi Jalan pada Tahun 2011 . (18.07.2019) Pesisir Pantai TAMP . - Cerapan Lokasi Jalan yang Runtuh Sepanjang 232.63 meter . (18.07.2019) Kawasan Pembangunan TAMP - Keadaan Jalan Raya yang Runtuh Akibat Hakisan Pantai  . (18.07.2019) (Sumber: Jabatan Audit Negara) iv. Kejadian jalan runtuh akibat hakisan pantai ini disebabkan tidak ada langkah pencegahan atau kawalan hakisan pantai diambil sebelum pembinaan jalan dilaksanakan. Berdasarkan kepada imej cerapan yang diperoleh daripada pihak MYSA, jarak jalan yang dibina dengan pesisir . pantai hanya sejauh 29 meter seperti dalam Gambar 8. Sehingga bulan Ogos 2019, belum ada kerja untuk mengawal hakisan pantai dilaksanakan. Selain itu, berdasarkan Laporan National Coastal Erosion . Study tahun 2015, pesisir pantai negeri Pahang telah terhakis lebih kurang 61.8 km dengan kadar purata maksimum hakisan pantai tahunan pula adalah melebihi 4 meter setahun.\", \"tgt\": \"- Road Location Insights in 2011. (18.07.2019) TAMP Beach Coast. - Observation of the location of the collapsed road along 232.63 meters. (18.07.2019) TAMP Development Area - Road Conditions Collapsed Due to Coastal Erosion. (18.07.2019) (Source: National Audit Department) iv. The incident of road collapse due to coastal erosion is due to the fact that no measures to prevent or control coastal erosion were taken before road construction was carried out. Based on the observation image obtained from the MYSA, the distance between the built road and the coast. the beach is only as far as 29 meters as in Picture 8. Until August 2019, no work has been carried out to control beach erosion. In addition, based on the National Coastal Erosion Report. Study in 2015, the coastal state of Pahang has eroded approximately 61.8 km with the maximum average rate of annual coastal erosion exceeding 4 meters per year.\", \"prefix\": \"terjemah ke Inggeris: \"}}\r\n"
     ]
    }
   ],
   "source": [
    "!tail -n 10 shuffled-train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
